Package/ Library¶

In [1]:
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm
import seaborn as sns
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import statsmodels.discrete.discrete_model as sm
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

Data Exploration & Profiling /EDA/¶

In [16]:
# 
CRisk = pd.read_csv(r"C:/Users/anten/OneDrive/Desktop/CIND 820 Project/Credit Risk Dataset/credit_risk_dataset.csv")
CRisk.head()
Out[16]:
person_age person_income person_home_ownership person_emp_length loan_intent loan_grade loan_amnt loan_int_rate loan_status loan_percent_income cb_person_cred_hist_length cb_person_default_on_file
0 22 59000 RENT 123.0 PERSONAL D 35000 16.02 1 0.59 3 Y
1 21 9600 OWN 5.0 EDUCATION B 1000 11.14 0 0.10 2 N
2 25 9600 MORTGAGE 1.0 MEDICAL C 5500 12.87 1 0.57 3 N
3 23 65500 RENT 4.0 MEDICAL C 35000 15.23 1 0.53 2 N
4 24 54400 RENT 8.0 MEDICAL C 35000 14.27 1 0.55 4 Y
In [17]:
# Exploratory Data Analysis 
Profile = ProfileReport(CRisk, title = 'Credit Risk dataset profiling Report', html ={'style':{'full_width': True}})
In [18]:
Profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[18]:

In [19]:
# Checking outlier among attributes 
# define a function called "plot_boxplot"
def plot_boxplot(df, ft):
    df.boxplot(column=[ft])
    plt.grid(False)
    plt.show()
In [20]:
plot_boxplot(CRisk, "person_age")
plot_boxplot(CRisk, "person_income")
plot_boxplot(CRisk, "person_emp_length")
plot_boxplot(CRisk, "loan_int_rate")
plot_boxplot(CRisk, "loan_percent_income")
plot_boxplot(CRisk, "cb_person_cred_hist_length")
plot_boxplot(CRisk, "loan_amnt")
In [21]:
# define a function called "outlers" which returns a list of index  of outliers

def outliers(df, ft):
    Q1 = df[ft].quantile(0.25)
    Q3 = df[ft].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    ls = df.index[(df[ft] < lower_bound) | (df[ft] > upper_bound)]
    
    return ls
In [22]:
index_list = []
for attribute in ["person_age", "person_income","person_emp_length", "loan_amnt", "loan_int_rate", "loan_percent_income",
                  "cb_person_cred_hist_length"]:
    index_list.extend(outliers(CRisk, attribute))
In [23]:
# define a function called 'remove' which returns a cleaned  dataframe without outliers 
def remove(df, ls):
    ls = sorted(set(ls))
    df = df.drop(ls)
    return df
In [24]:
CRisk = remove(CRisk, index_list)
In [25]:
CRisk.shape
Out[25]:
(27024, 12)

Spliting the dataset (train, test and validation sets)¶

In [12]:
# Training set used to train moel
# Validation set used to evaluate the model during training, tune model hyperparameters
#  test set usded to compare different modlels o and to report the model accuracy
In [26]:
train_val_df, test_df = train_test_split(CRisk,  test_size = 0.2 )
train_df, val_df = train_test_split(train_val_df, test_size = 0.25)
In [27]:
print('train_df.shape:' , train_df.shape)
print('val_df.shape:' , val_df.shape)
print('test_df.shape:' , test_df.shape)
train_df.shape: (16214, 12)
val_df.shape: (5405, 12)
test_df.shape: (5405, 12)
In [28]:
# Selecting only the important columns (inputs for  modeling)
input_cols = list(train_df[['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent',
                         'loan_grade', 'loan_amnt', 'loan_int_rate','loan_percent_income',
                         'cb_person_cred_hist_length', 'cb_person_default_on_file' ]])
print(input_cols)
['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'cb_person_default_on_file']
In [29]:
# target columns 
target_col = 'loan_status'
print(target_col)
loan_status
In [30]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()
In [31]:
val_inputs =val_df[input_cols].copy()
val_targets = val_df[target_col].copy()
In [32]:
test_inputs = test_df[input_cols].copy()
test_targets = val_df[target_col].copy()
In [35]:
train_inputs = train_df[input_cols]
train_targets = train_df[target_col]
In [34]:
val_inputs =val_df[input_cols]
val_targets = val_df[target_col]
In [36]:
test_inputs = test_df[input_cols]
test_targets = val_df[target_col]
In [37]:
numeric_cols = train_inputs.select_dtypes(include = np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()
In [38]:
numeric_cols
Out[38]:
['person_age',
 'person_income',
 'person_emp_length',
 'loan_amnt',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_cred_hist_length']
In [39]:
categorical_cols
Out[39]:
['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file']
In [40]:
train_inputs[categorical_cols].nunique()
Out[40]:
person_home_ownership        4
loan_intent                  6
loan_grade                   7
cb_person_default_on_file    2
dtype: int64
In [41]:
# checking missing value in train set 
train_inputs[numeric_cols].isna().sum()
Out[41]:
person_age                       0
person_income                    0
person_emp_length              495
loan_amnt                        0
loan_int_rate                 1532
loan_percent_income              0
cb_person_cred_hist_length       0
dtype: int64
In [42]:
# checking missing value in test set
test_inputs [numeric_cols].isna().sum()
Out[42]:
person_age                      0
person_income                   0
person_emp_length             141
loan_amnt                       0
loan_int_rate                 495
loan_percent_income             0
cb_person_cred_hist_length      0
dtype: int64

Missing Values handeling¶

In [43]:
# Missing values handeling using Sklearn  impute from from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'mean')
In [44]:
imputer.fit(CRisk[numeric_cols])
Out[44]:
SimpleImputer()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SimpleImputer()
In [45]:
list(imputer.statistics_)
Out[45]:
[26.38306690349319,
 57535.372372705744,
 4.353270814272644,
 8418.799030491415,
 10.919090240013084,
 0.16262877442273535,
 5.0016281823564235]
In [ ]:
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])
In [47]:
train_inputs[numeric_cols].isna().sum()
Out[47]:
person_age                    0
person_income                 0
person_emp_length             0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_cred_hist_length    0
dtype: int64

Scaling Numeric features ( Normalization)¶

In [48]:
# Feature Scaling  using from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
In [49]:
scaler.fit(CRisk[numeric_cols])
Out[49]:
MinMaxScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MinMaxScaler()
In [ ]:
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])
In [51]:
train_inputs[numeric_cols].describe()
Out[51]:
person_age person_income person_emp_length loan_amnt loan_int_rate loan_percent_income cb_person_cred_hist_length
count 16214.000000 16214.000000 16214.000000 16214.000000 16214.000000 16214.000000 16214.000000
mean 0.318573 0.395210 0.311921 0.351804 0.335310 0.353428 0.231813
std 0.210992 0.196427 0.238465 0.216205 0.187395 0.216335 0.229107
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.150000 0.249551 0.142857 0.195556 0.151961 0.186047 0.076923
50% 0.250000 0.361691 0.285714 0.311111 0.336954 0.302326 0.153846
75% 0.450000 0.508519 0.428571 0.501667 0.468137 0.488372 0.384615
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

Standardization¶

In [52]:
# from sklearn.preprocessing import StandardScaler
In [53]:
scaler = StandardScaler()
In [54]:
scaler.fit_transform(CRisk[numeric_cols])
Out[54]:
array([[-1.27613037, -1.79695308,  0.19141909, ...,  0.06897353,
        -0.6722018 , -1.00988286],
       [-1.27613037, -1.78570698, -0.69652179, ..., -1.17992614,
         0.93776536, -1.00988286],
       [-1.27613037, -1.78195828,  0.48739939, ...,  1.19298324,
        -0.02821494, -0.67343784],
       ...,
       [ 3.2280821 , -0.91975739, -0.69652179, ...,         nan,
        -1.42351981,  3.36390242],
       [ 2.51689066,  1.21700134,  1.96730086, ..., -1.3422831 ,
        -1.31618867,  3.36390242],
       [ 2.75395447,  1.96674125,  0.19141909, ...,  0.33436471,
        -0.13554608,  2.69101237]])

Encoding categorical variables¶

In [55]:
# Encoding Categorical Variable using OneHotEncoder
CRisk[categorical_cols].nunique()
Out[55]:
person_home_ownership        4
loan_intent                  6
loan_grade                   7
cb_person_default_on_file    2
dtype: int64
In [56]:
CRisk["person_home_ownership"].value_counts()
Out[56]:
RENT        14221
MORTGAGE    10596
OWN          2123
OTHER          84
Name: person_home_ownership, dtype: int64
In [57]:
CRisk["loan_intent"].value_counts()
Out[57]:
EDUCATION            5537
MEDICAL              5061
VENTURE              4763
PERSONAL             4490
DEBTCONSOLIDATION    4314
HOMEIMPROVEMENT      2859
Name: loan_intent, dtype: int64
In [58]:
CRisk["loan_grade"].value_counts()
Out[58]:
A    9219
B    8550
C    5510
D    2835
E     729
F     150
G      31
Name: loan_grade, dtype: int64
In [59]:
CRisk["cb_person_default_on_file"].value_counts()
Out[59]:
N    22343
Y     4681
Name: cb_person_default_on_file, dtype: int64
In [60]:
encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
In [61]:
encoder.fit(CRisk[categorical_cols])
Out[61]:
OneHotEncoder(handle_unknown='ignore', sparse=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
OneHotEncoder(handle_unknown='ignore', sparse=False)
In [64]:
encoder.categories_
Out[64]:
[array(['MORTGAGE', 'OTHER', 'OWN', 'RENT'], dtype=object),
 array(['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL',
        'PERSONAL', 'VENTURE'], dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype=object),
 array(['N', 'Y'], dtype=object)]
In [65]:
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
print(encoded_cols)
['person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER', 'person_home_ownership_OWN', 'person_home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_A', 'loan_grade_B', 'loan_grade_C', 'loan_grade_D', 'loan_grade_E', 'loan_grade_F', 'loan_grade_G', 'cb_person_default_on_file_N', 'cb_person_default_on_file_Y']
In [66]:
train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])
In [68]:
## Train_inputs = X_train, train_targets = Y_train  and test_inputs = X_test ,  and test_target = Y_test )
print('train_inputs:', train_inputs.shape)
print('train_targets:', train_targets.shape)
print('val_inputs:', val_inputs.shape)
print('val_targets:', val_targets.shape)
print('test_inputs:', test_inputs.shape)
print('test_targets:', test_targets.shape)
train_inputs: (16214, 30)
train_targets: (16214,)
val_inputs: (5405, 30)
val_targets: (5405,)
test_inputs: (5405, 30)
test_targets: (5405,)

Modelling /Logistic regression/¶

In [69]:
# Logistic regression is a commonly used techniques for solving  binary classification problems
model = LogisticRegression(solver = 'liblinear')
In [70]:
model.fit(train_inputs[numeric_cols + encoded_cols], train_targets)
Out[70]:
LogisticRegression(solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(solver='liblinear')
In [71]:
print(numeric_cols + encoded_cols)
['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER', 'person_home_ownership_OWN', 'person_home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_A', 'loan_grade_B', 'loan_grade_C', 'loan_grade_D', 'loan_grade_E', 'loan_grade_F', 'loan_grade_G', 'cb_person_default_on_file_N', 'cb_person_default_on_file_Y']
In [77]:
# Postive and Negative predicters of deafult 
print(model.coef_.tolist())
[[-0.3966632588158861, -0.349461744682226, -0.22479355317937583, -3.178825897398039, 1.0716674803028918, 6.043505575968809, 0.13774254214615003, -0.0018494442477760297, 0.1144859084628128, -1.7104156664478034, 0.6135744923117117, 0.19794626735315135, -0.5870843514028505, 0.4876678251023244, 0.011942042170631528, -0.2907054486905857, -0.8039710444555295, -1.9895767731684757, -1.7477572926961862, -1.5671262467873217, 0.4479693550089058, 0.7561135922390441, 0.8495202208734832, 2.266652434607905, -0.5309815984648207, -0.4532231114575803]]
In [78]:
print(model.intercept_)
[-0.98420471]
In [79]:
print(model.feature_names_in_)
['person_age' 'person_income' 'person_emp_length' 'loan_amnt'
 'loan_int_rate' 'loan_percent_income' 'cb_person_cred_hist_length'
 'person_home_ownership_MORTGAGE' 'person_home_ownership_OTHER'
 'person_home_ownership_OWN' 'person_home_ownership_RENT'
 'loan_intent_DEBTCONSOLIDATION' 'loan_intent_EDUCATION'
 'loan_intent_HOMEIMPROVEMENT' 'loan_intent_MEDICAL'
 'loan_intent_PERSONAL' 'loan_intent_VENTURE' 'loan_grade_A'
 'loan_grade_B' 'loan_grade_C' 'loan_grade_D' 'loan_grade_E'
 'loan_grade_F' 'loan_grade_G' 'cb_person_default_on_file_N'
 'cb_person_default_on_file_Y']
In [81]:
# # Attributes are positively correlated and negatively correlated : 
# the higher the weight of the predictor and more important, the lower the less important 

n = len(model.coef_.tolist())
pd.DataFrame({
    'Attributes':(numeric_cols + encoded_cols),
    'Predicters':model.coef_.tolist()[0]
    
})
Out[81]:
Attributes Predicters
0 person_age -0.396663
1 person_income -0.349462
2 person_emp_length -0.224794
3 loan_amnt -3.178826
4 loan_int_rate 1.071667
5 loan_percent_income 6.043506
6 cb_person_cred_hist_length 0.137743
7 person_home_ownership_MORTGAGE -0.001849
8 person_home_ownership_OTHER 0.114486
9 person_home_ownership_OWN -1.710416
10 person_home_ownership_RENT 0.613574
11 loan_intent_DEBTCONSOLIDATION 0.197946
12 loan_intent_EDUCATION -0.587084
13 loan_intent_HOMEIMPROVEMENT 0.487668
14 loan_intent_MEDICAL 0.011942
15 loan_intent_PERSONAL -0.290705
16 loan_intent_VENTURE -0.803971
17 loan_grade_A -1.989577
18 loan_grade_B -1.747757
19 loan_grade_C -1.567126
20 loan_grade_D 0.447969
21 loan_grade_E 0.756114
22 loan_grade_F 0.849520
23 loan_grade_G 2.266652
24 cb_person_default_on_file_N -0.530982
25 cb_person_default_on_file_Y -0.453223
In [82]:
# Visualizing the  logsitic regression coef
n = len(model.coef_.tolist())
weight_df = pd.DataFrame({
    'feature':(numeric_cols + encoded_cols),
    'weight':model.coef_.tolist()[0]
})
In [83]:
sns.barplot(data = weight_df.sort_values('weight', ascending = False).head(25), x = 'weight', y = 'feature')
Out[83]:
<AxesSubplot:xlabel='weight', ylabel='feature'>

Making prediction¶

In [86]:
#  Evaluating the model 
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]
In [87]:
train_preds = model.predict(X_train)
In [88]:
train_preds
Out[88]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
In [89]:
#  The model achives an accuracy of 86% of training set 
accuracy_score(train_targets, train_preds)
Out[89]:
0.8648081904526952
In [90]:
confusion_matrix(train_targets, train_preds)
Out[90]:
array([[12206,   596],
       [ 1596,  1816]], dtype=int64)
In [91]:
# Visualization 
def predict_and_plot(inputs, targets, name =''):
    preds = model.predict(inputs)
    
    accuracy = accuracy_score(targets, preds)
    print("Accuracy: {:.2f}%".format(accuracy * 100))
    
    cf = confusion_matrix(targets, preds, normalize = 'true')
    plt.figure()
    sns.heatmap(cf, annot = True)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    plt.title('{} Confusion Matrix'.format(name));
    
    return preds

Accuracy on the training set¶

In [92]:
train_preds = predict_and_plot(X_train, train_targets, 'Training')
Accuracy: 86.48%

Accuracy on test set¶

In [93]:
 
test_preds = predict_and_plot(X_test, test_targets, 'Test')
Accuracy: 71.29%
In [94]:
print(classification_report(test_targets, test_preds))
              precision    recall  f1-score   support

           0       0.80      0.85      0.83      4297
           1       0.23      0.17      0.19      1108

    accuracy                           0.71      5405
   macro avg       0.51      0.51      0.51      5405
weighted avg       0.68      0.71      0.70      5405

Decision tree classifer¶

In [95]:
from sklearn import tree
dt_clf = tree.DecisionTreeClassifier(max_depth = 5)
dt_clf.fit(X_train, train_targets)
dt_clf.score(X_test, test_targets)
Out[95]:
0.7250693802035153
In [96]:
train_preds = dt_clf.predict(X_test)
dt_clf.score(X_test, test_targets)
confusion_matrix(test_targets, train_preds)
Out[96]:
array([[3761,  536],
       [ 950,  158]], dtype=int64)
In [97]:
print(classification_report(test_targets, train_preds))
              precision    recall  f1-score   support

           0       0.80      0.88      0.84      4297
           1       0.23      0.14      0.18      1108

    accuracy                           0.73      5405
   macro avg       0.51      0.51      0.51      5405
weighted avg       0.68      0.73      0.70      5405

Random Forest Classfier¶

In [98]:
from sklearn import ensemble
rf_clf = ensemble.RandomForestClassifier(n_estimators = 100)
rf_clf.fit(X_train, train_targets)
rf_clf.score(X_test, test_targets)
Out[98]:
0.7110083256244218
In [99]:
train_preds = rf_clf.predict(X_test)
rf_clf.score(X_test, test_targets)
confusion_matrix(test_targets, train_preds)
Out[99]:
array([[3671,  626],
       [ 936,  172]], dtype=int64)
In [100]:
print(classification_report(test_targets, train_preds))
              precision    recall  f1-score   support

           0       0.80      0.85      0.82      4297
           1       0.22      0.16      0.18      1108

    accuracy                           0.71      5405
   macro avg       0.51      0.50      0.50      5405
weighted avg       0.68      0.71      0.69      5405

Naive Bayes Classifier¶

In [101]:
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()
nb_clf.fit(X_train, train_targets)
nb_clf.score(X_test, test_targets)
Out[101]:
0.7150786308973173
In [102]:
train_preds = nb_clf.predict(X_test)
nb_clf.score(X_test, test_targets)
confusion_matrix(test_targets, train_preds)
Out[102]:
array([[3685,  612],
       [ 928,  180]], dtype=int64)
In [103]:
print(classification_report(test_targets, train_preds))
              precision    recall  f1-score   support

           0       0.80      0.86      0.83      4297
           1       0.23      0.16      0.19      1108

    accuracy                           0.72      5405
   macro avg       0.51      0.51      0.51      5405
weighted avg       0.68      0.72      0.70      5405